/*==============================================================================
CLEANING OF NOTIFICATION DATA

Data 	: DS_Lev_AF_hist_varsel.dta
Folder 	: rawdata/SCB_Leverans_2016/Leverans 7 Jonas/DS_Lev_AF_hist_varsel.dta
Date	: 2017-04-25

Creator		: Jonas Cederlof	(JC)
Description :	
Notes		: 

LATEST UPDATE : 2017-05-10

==============================================================================*/

* Session setup: close any log left open by an earlier run, switch off output
* paging and empty memory before anything is loaded.
capture log close _all
set more off
clear

* Open this file's log (overwriting the previous one) and load the firm-level
* notification history.
log using "../log/A1_clean_varseldata.log", replace
use "$rawdatapath/DS_Lev_AF_hist_varsel.dta"
*Note: each notification has multiple entries as the PES adds new information
*in new rows. Thus we start out with about 1.4 million obs, but after keeping the
*last entry (which contains all information) we are left with around 400K obs.

{ // Convert persfortin_dat from string to a Stata daily date so that the second dataset can be appended properly
*===============================================================================
* The string is read positionally: characters 1-4 are taken as the year, 6-7 as
* the month and 9-10 as the day (e.g. a "YYYY-MM-DD" layout).
foreach datevar of varlist persfortin_dat {
	generate year  = substr(`datevar', 1, 4)
	generate month = substr(`datevar', 6, 2)
	generate day   = substr(`datevar', 9, 2)
	destring year month day, replace

	* Build the date under a temporary name, then swap it in for the string.
	generate temp_`datevar' = mdy(month, day, year)
	format   temp_`datevar' %td

	drop   `datevar' year month day
	rename temp_`datevar' `datevar'
}
}
*
{ // Merging with individual data
*===============================================================================
* Each firm-level history row is matched to the workers listed under it.
merge 1:m lopnr_varsellnr lopnr_hist_varsellnr ///
	using "$rawdatapath/DS_Lev_AF_hist_varselperson.dta"

* _merge==1: no need to worry, these are mostly early observations for firms
* that have not yet sent in a list of people. For "all" notifications a firm
* starts with _merge==1 and becomes _merge==3 once the list is registered at
* the PES. For e.g. lopnr_varsellnr==9 there is a recall, so no list is
* registered and hence no _merge==3.
*
* _merge==2: individuals that are notified but whose varsel id does not exist
* in the firm-level notification data (DS_Lev_AF_hist_varsel.dta).

sort lopnr_varsellnr tr_dat
*list lopnr_varsellnr lopnr_hist_varsellnr peorglopnr tr_dat varselstatus _merge in 1/1000

* Only matched rows are kept.
drop if _merge != 3
drop _merge
}
*


{ // Merging with cause of notification
*===============================================================================
* One cause record per notification id is attached to every worker row.
* There are some more obs in the using data for some reason; only matched rows
* are kept.
merge m:1 lopnr_varsellnr using "$datapath/A1_varselorsak.dta"
drop if _merge != 3
drop _merge
}
*


// Append 2016-2019 notifications
* The newer delivery is prepared inside preserve/restore so the data in memory
* are untouched: firm rows are merged with worker rows, restricted to arrival
* years 2016-2019, renamed to the older delivery's variable names, merged with
* the cause file and stored in a tempfile that is appended after restore.
preserve
	use "$rawdatapath/varsel_ftg_20152019.dta", clear

	* Merge on workers
	merge 1:m lopnr_varsellnr lopnr_hist_varsellnr ///
		using "$rawdatapath/varsel_ind_20152019.dta"
	drop if _merge != 3 // again no worries for _merge==1
	drop _merge

	* Keep notifications that arrived in 2016-2019 (missing dates are dropped)
	keep if inrange(year(ankomst_dat), 2016, 2019)

	* Rename to correspond to names in the other data set
	rename (lopnr_peorgnr lopnr_cfarnr varselarbglnr) ///
	       (peorglopnr    cfarlopnr    lopnr_varselarbglnr)

	* There are some more obs in the using data for some reason.
	merge m:1 lopnr_varsellnr using "$datapath/A1_varselorsak_16_19.dta"
	drop if _merge != 3
	drop _merge

	* Marker for rows coming from the newer delivery (missing for older rows)
	generate newvarseldata = 1

	tempfile new_notifications
	save `new_notifications'
restore

* NOTE(review): -force- lets string and numeric versions of the same variable
* be appended; in that case the values from the appended file become missing.
* Worth confirming that no needed variable differs in type between deliveries.
append using `new_notifications', force



{ // Rename, encoding, destring and labeling variables & values 
********************************************************************************

// 		Rename variables
*===============================================================================
* Identifiers
rename (lopnr  peorglopnr lopnr_varselarbglnr lopnr_varsellnr lopnr_hist_varsellnr cfarlopnr) ///
       (persid firmid     firm_varsel_id      varselid        varsel_postid        plantid)

* Dates
rename (persfortin_dat driftinskr_dat atertag_dat) ///
       (list_dat       firstgo_dat    recall_dat)

* Number of notified workers by category
rename (antvar_tjm    antvar_kollanst antvar_ovr) ///
       (novarsel_wcwo novarsel_coll   novarsel_rest)

* Number of employees by gender / category
rename (antanst_man antanst_kvinnor antanst_kollanst antanst_ovr antanst_tjm) ///
       (noemp_men   noemp_wom       noemp_coll       noemp_rest  noemp_wcwo)



// Destring date variable (only one date variable is not in Stata format)
*===============================================================================
* anstupphor_dat is a string read positionally (characters 1-4 year, 6-7 month,
* 9-10 day). The daily date is stored directly under its final name,
* anstupp_dat, and the string version is dropped.
generate year  = substr(anstupphor_dat, 1, 4)
generate month = substr(anstupphor_dat, 6, 2)
generate day   = substr(anstupphor_dat, 9, 2)
destring year month day, replace

generate anstupp_dat = mdy(month, day, year)
format   anstupp_dat %td

drop year month day anstupphor_dat

// Encode string values to numeric
*===============================================================================
* Each listed string variable is replaced by an encoded numeric version that
* keeps the original name. -encode- numbers the distinct string values 1, 2, ...
* in alphabetical order, so the numeric codes depend on exactly which strings
* occur in the data.

foreach var of varlist  varselkat varselstatus komplettvarsel fullpersfort ///
			varselflytt varseltyp mbl11 forhpart_sakn inkomi_tid ///
			regelavsteg {
							
	encode 		`var', gen(temp_`var') 
	drop 		`var'
	rename 		temp_`var' 	`var'
}
*Manually fix weird letters causing some issues
* NOTE(review): the code pairs merged below are hard-coded from the tabulations
* (presumably the same category spelled with differently encoded characters).
* They must be re-checked against the -tab- output whenever the input data
* change, because a new or missing string value shifts every code after it.
tab varselstatus
tab varselstatus,nol
replace varselstatus=5 if varselstatus==6
replace varselstatus=7 if varselstatus==8
tab varselstatus

tab varselflytt
replace varselflytt= 2 if varselflytt==3
replace varselflytt= 2 if varselflytt==4
replace varselflytt= 5 if varselflytt==6

* The recoded value 3 is used further down (varseltyp == 3) to select
* notifications when dropping "permittering".
tab varseltyp
tab varseltyp,nol
replace varseltyp=3 if varseltyp==4

// Changing label on binary variables 
*===============================================================================
* The encoded flags carry codes 1/2; code 2 is mapped to 0 and a No/Yes value
* label named after the variable is attached. Other values are left unchanged.
local yesno_flags komplettvarsel fullpersfort mbl11 forhpart_sakn inkomi_tid regelavsteg
foreach flag of varlist `yesno_flags' {
	recode       `flag' (1=1) (2=0)
	label define `flag' 0 "No" 1 "Yes"
	label values `flag' `flag'
}


// Replace missing values with zeros in 'Number of' variables
*===============================================================================
* Applies to every variable whose name starts with novar or noemp.
foreach countvar of varlist novar* noemp* {
	replace `countvar' = 0 if `countvar' == .
}

// 		Label variables
*===============================================================================
* Variable labels for identifiers, timestamps, flags, codes and counts.
* Fixes relative to the previous version:
*   - noemp_men / noemp_wom had each other's labels (noemp_men is renamed from
*     antanst_man, noemp_wom from antanst_kvinnor).
*   - novarsel_wcwo read "with collar workers"; it is the white-collar count,
*     matching the label of noemp_wcwo.
*   - varselstatus was labelled twice with the same text; one statement is kept.
lab var 	firmid			"ID: Firm"	
lab var 	persid 			"ID: individual"						
lab var 	firm_varsel_id		"ID: for employer who lay a particular varsel"
lab var 	varselid 		"ID: for the particular varsel"
lab var 	varsel_postid 		"ID: for an observation in a particular varsel"
lab var 	plantid			"ID: Plant/Workplace"
lab var 	ansvkontor		"ID: Office responsible for the varsel"

lab var 	tr_dat			"Timestamp: When the varsel-post was entered in the system"
lab var 	ankomst_dat		"Timestamp: When varsel arrives to 'Lansarbetsnamden'"
lab var 	list_dat		"Timestamp: When list of workers getting laid of is received"
lab var 	recall_dat		"Timestamp: When/if the varsel was retracted/recalled"
lab var 	avslut_dat		"Timestamp: When the varsel is completed"
lab var 	bekraft_dat		"Timestamp: When the conformation was sent to the employer"
lab var 	firstgo_dat		"Timestamp: The first person is let go"
lab var 	anstupp_dat		"Timestamp: displacement date"
lab var 	inkom_dat		"Timestamp: when advance notice is given"

lab var 	komplettvarsel		"Flag: Complete varsel"	    		     			// Meaning exactly?!
lab var 	fullpersfort		"Flag: Complete list of personel"
lab var 	mbl11			"Flag: Negotiation according to MBL pargf.11 law"
lab var 	forhpart_sakn		"Flag: Negotiation partner missing?"	
lab var 	inkomi_tid		"Flag: Was the varsel was handed in on time? "
lab var 	regelavsteg		"Flag: Side-stepping the rules"

lab var 	lan 			"Code: County in which the Employer is active"
lab var 	varselflytt		"Code: for/if frim is moving and where"
lab var 	varselstatus		"Code: for Varsel status"
lab var 	varseltyp		"Code: Type of varsel"
lab var 	kommun			"Code: Municipality"
lab var 	sni2002			"Code: Industry code (SCB 2002)"
lab var 	sni2007			"Code: Industry code (SCB 2007)"
lab var 	jurform			"Code: Legal coorparate form"

lab var 	novarsel_wcwo		"Number of varslade white-collar workers" 
lab var 	novarsel_coll		"Number of varslade with collective agreement"
lab var 	novarsel_rest		"Number of varslade rest (SWE: ovriga)"

lab var 	noemp_men		"Number of employed men"
lab var 	noemp_wom		"Number of employed women"
lab var 	noemp_coll		"Number of employed with collective agreement" 
lab var 	noemp_rest		"Number of employed rest (SWE: orviga)"
lab var 	noemp_wcwo		"Number of employed white-collar workers (tjansteman)"

lab var 	varselkat		"Category of employment (coarse)" 		


}	
* 


{ // Checking concistency of variables
********************************************************************************
 
{ //  Varsel ID (varselid) 
*========================
count 	if 	varselid==. 			

* Does the same varselid ever appear under more than one firm?
* -contract- reduces the data to one row per distinct (varselid, firmid) pair,
* so any surplus observation reported by -duplicates report varselid- is a
* notification id that occurs with more than one firmid (a missing firmid
* counts as its own value).
* The previous check (collapse firmid, by(varselid) followed by -duplicates
* report-) could never find anything: collapsing by varselid always leaves
* exactly one row per varselid.
preserve
	contract varselid firmid
	duplicates report varselid
restore
}
*Comment: 	The same varsel does not appear in different firms
{ // Firm varsel ID  (firm_varsel_id)
*=================================
count if firm_varsel_id == .

* If knowing the varselid also tells you the firm (see the check above, given
* firmid!=.), then firm_varsel_id must be redundant information.

* Is firm_varsel_id just a relabelling of varselid, conveying the same
* information? Mark the first row of every (firm_varsel_id, varselid) pair and
* count the marks within each id.
bysort firm_varsel_id varselid : generate nvals = (_n == 1)
bysort firm_varsel_id : egen uniqe_varsel         = total(nvals) // number of distinct varselid per firm_varsel_id
bysort varselid       : egen uniqe_firm_varsel_id = total(nvals) // number of distinct firm_varsel_id per varselid

summarize uniqe_firm_varsel_id uniqe_varsel
drop uniqe_firm_varsel_id uniqe_varsel nvals

* The variable is dropped whatever the summary above shows.
drop firm_varsel_id
}
*Comment: 	The variables firm_varsel_id & varselid convey the exact same 
*		information. They are just multiples of each other and hence 
*		firm_varsel_id IS DROPPED.
{ // Number of employed Gender
*===========================
summarize noemp_men noemp_wom
count if noemp_men == 0 & noemp_wom == 0

* Replacing "double zeroes" with missing values. The second statement relies
* on the first: once noemp_men has been set to missing, noemp_men==. marks
* exactly the rows where both counts were zero.
replace noemp_men = . if noemp_men == 0 & noemp_wom == 0
replace noemp_wom = . if noemp_men == . & noemp_wom == 0

* How stable is the gender composition within each varsel?
* Rows are ordered by the count itself (then tr_dat) within varselid, so each
* difference compares neighbouring values in that ordering.
bysort varselid (noemp_men tr_dat) : generate change_men = noemp_men[_n+1] - noemp_men if noemp_wom != .
bysort varselid (noemp_wom tr_dat) : generate change_wom = noemp_wom[_n+1] - noemp_wom if noemp_men != .

tab  change*
drop change*
}
*Comment: 	More than 99% of the sample does not change gender compoisiton within 
*		a varsel
{ // Total number of employed  
*=============================== 
* Two candidate totals: one from the gender counts, one from the worker
* categories. Only the category-based total is kept.
generate noemp_tot_gender = noemp_men + noemp_wom                  // from gender frequency variables
generate noemp_tot        = noemp_coll + noemp_rest + noemp_wcwo   // sum over work categories

* Compare the two wherever the gender-based total is available.
compare noemp_tot_gender noemp_tot if !missing(noemp_tot_gender)

drop noemp_tot_gender
label variable noemp_tot "Number of employed in total (FIRM DATA : coll+rest+wcwo)"
}
*Comment: 	For the number of people working at a firm it seemes that summing
* 	  	based on workers category is more relaible than summing based on 
* 	  	gender.
{ // Number of notified individuals (based on firm registration) 
*=============================================================
* Row total over the three notified-worker categories (rowtotal treats missing
* as zero).
egen novars_tot = rowtotal(novarsel_wcwo novarsel_coll novarsel_rest)
summarize novars_tot
label variable novars_tot "Number of varslade in total (FIRM DATA : coll+rest+wcwo)"


* The number of varslade should not exceed the number of employees at the
* firm; the flag is only summarised here and then dropped.
bysort varselid : generate flag = (novars_tot > noemp_tot)
summarize flag
drop flag
}
*
{ // Do firms vary inkom_dat or anstupp_dat between individuals

* For each date, flag rows whose value differs from the next row in the same
* varselid (only where both dates are non-missing).
foreach d of varlist inkom_dat anstupp_dat {
	bysort varselid : generate x_diff_`d' = (`d' != `d'[_n+1]) if `d' != . & `d'[_n+1] != .
}

* Share of notifications with at least one such difference.
preserve
	collapse (max) x_*, by(varselid)
	summarize x_*
restore

drop x_*

}
*Comment : 	8 % of displacements have differences in notification date between
*		individuals within a displacement. The corresponding share for
*		differences in last day of employment is 83 percent.

{ // Changes of individual notification and end dates within displacement
*===============================================================================
* For each person within a notification, ordered by tr_dat, record the change
* in the date between consecutive entries (non-zero changes only) and the
* largest such change per person.
foreach d of varlist inkom_dat anstupp_dat {
	bysort varselid persid (tr_dat) : generate x_diff_`d' = `d'[_n+1] - `d' ///
		if `d'[_n+1] != . & `d' != . & `d'[_n+1] != `d'
	bysort varselid persid : egen x_max_diff_`d'_binary = max(x_diff_`d')
}

* Person-by-notification summary, followed by the row-level summary.
preserve
	collapse (max) x_diff_inkom_dat x_diff_anstupp_dat, by(persid varselid)
	summarize x_*
restore
summarize x_*
drop x_*

}
*Comment:	Few indivuals have changes in their date of notification or last
*		day of employment. 1,906 and 5,519 respectivly. This amounts to
*		about 0.4% and 1.1% of the sample. Note that this is analysis is
*		before correcting for large outliers (obvious wrong enteries, see
*		code below). 

{ // Updates of notification list
*================================
* Count the persons on each list (varselid x list_dat) and check whether that
* count changes from one list to the next within a notification.
preserve
	drop if list_dat == .
	duplicates drop persid varselid list_dat, force // so that individuals are unique for each list
	collapse (max) fullpers (count) persid, by(varselid list_dat)

	* After the collapse persid holds the number of persons on the list.
	bysort varselid : generate diff_freq = persid[_n+1] - persid if persid[_n+1] != .

	* 1 = count changes to the next list, missing = unchanged,
	* 0 = no next list to compare with.
	generate temp = cond(diff_freq == 0, ., diff_freq != .)
	collapse (max) temp, by(varselid)

	summarize temp
restore
}
*Comment: 	about 1.2 percent update their list by adding or withdrawing 
*		names from it. I.e. changes the number of people getting displaced

}
*
{ // Keeping last registered observation for each individual within a varselid
********************************************************************************
* The last entry is the one with the latest tr_dat. Stata's sort does not keep
* a fixed order among ties, so when two entries for the same person share a
* tr_dat the row that survived used to vary from run to run. varsel_postid is
* added as a tie-breaker to make the selection reproducible.
bys varselid persid (tr_dat varsel_postid) : gen lastobs = _n== _N
sort varselid tr_dat varsel_postid persid
*list varselid varsel_postid tr_dat inkom_dat list_dat firstgo_dat  persid inkom_dat anstupp_dat lastobs in 1/1000, sepby(varsel_postid)

keep if lastobs==1
drop 	lastobs
tab 	varselstatus


}
*
{ // Correcting outliers in firm level 'Timestamp' variables
********************************************************************************
* I will not flag observations here but merely correct obviously wrong entries.
* Pattern used throughout: extract the year of each date, list the suspicious
* years for inspection, overwrite the year with the assumed intended value and
* rebuild the date from the original month/day and the corrected year.
* NOTE(review): mdy() returns missing for an impossible date, so a 29 February
* moved to a non-leap year would become missing.


*Create a year variable for each Timestamp
foreach var of varlist 	ankomst_dat bekraft_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat{
	gen 	year_`var' = year(`var')
}
*=======================
* FIRM VARIABLES
*=======================
*Ankomst_dat
	tab year_ankomst_dat		// OK 	(very few obs in 2004)
	
*Bekraft_dat	
	tab year_bekraft_dat	 	// OK 	(very few obs in 2004)

*List_dat	
	tab year_list_dat
	
	*Listing suspicious years. The list previously contained 2918 twice and
	*left out 2919, although 2919 is one of the years corrected below.
	foreach year of numlist 	1004 2016 2017 2020 2918 2919{
		list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus  		if year_list_dat==`year' , sepby( varselid)
	}
	*Replace
	replace year_list_dat = 2004 		if year_list_dat == 1004 
	replace year_list_dat = 2018 		if year_list_dat == 2918 
	replace year_list_dat = 2019 		if year_list_dat == 2919


	replace list_dat = mdy(month(list_dat), day(list_dat), year_list_dat)


*Firstgo_dat (Not a very important variable)
	tab year_firstgo_dat 

	*Listing suspicious years (2031 was listed twice before).
	foreach year of numlist  2029 2031 2102 2103 2206  2025 2030 2106 2109{		
		list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus   	if year_firstgo_dat==`year' , sepby( varselid)
	}	
	replace year_firstgo_dat = 2006 	if year_firstgo_dat==2206
	replace year_firstgo_dat = 2013 	if year_firstgo_dat==2103
	replace year_firstgo_dat = 2012 	if year_firstgo_dat==2102
	replace year_firstgo_dat = 2017 	if year_firstgo_dat==2030
	* Year 2031 is set to missing, which makes firstgo_dat missing below.
	replace year_firstgo_dat = . 		if year_firstgo_dat==2031
	replace year_firstgo_dat = 2016 	if year_firstgo_dat==2106
	replace year_firstgo_dat = 2019 	if year_firstgo_dat==2109
	replace firstgo_dat = mdy(month(firstgo_dat), day(firstgo_dat), year_firstgo_dat)

*Avslut_dat
	tab year_avslut_dat
	
	foreach i of numlist 2031 2088 2099 2103{	
		list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus 	 if year_avslut_dat==`i' , sepby( varselid)
	}	
	replace year_avslut_dat = 2013 		if year_avslut_dat==2031
	replace year_avslut_dat = 2008 		if year_avslut_dat==2088
	replace year_avslut_dat = 2009 		if year_avslut_dat==2099
	replace year_avslut_dat = 2013 		if year_avslut_dat==2103
	replace avslut_dat 	 	= mdy(month(avslut_dat), day(avslut_dat), year_avslut_dat)
	
*=======================
* INDIVIDUAL VARIABLES 	(More important!)	
*=======================	
* Same correction pattern as for the firm-level dates: inspect the year
* tabulation, list the notification entries that contain a suspicious year,
* overwrite the year and rebuild the date from the original month and day.

misstable 	summarize		 anstupp_dat inkom_dat 			 // 1 missing observations in anstupp_dat
		summarize		 anstupp_dat inkom_dat, d  format	 // Some extreme values
			
			
*NOTE: 	In context with other individuals within the same varsel_postid 
*		it should be clear that some are just wrong entries, like a 9 that is 
*		supposed to be a 0 (keys next to each other) and so on.


****Inkom_dat
tab year_inkom_dat

*	Based on the tabulation I would guess that complete data is
*	available between 2005 and 2015. 		  

*Listing wrong entries
* For each suspicious year, every row of a varsel_postid that contains at least
* one inkom_dat in that year is listed, so the wrong entry can be read next to
* the other individuals in the same entry. The helper variable is dropped again
* at the end of each iteration.
foreach year of numlist 2003 2016 2088 2102 2208 2209{
	display "THE YEAR TO LOOK FOR IS `year'"
	bys varsel_postid : egen wrong_entry_`year' = max(year_inkom_dat==`year') 
	list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus if wrong_entry_`year'==1, sepby(varsel_postid)
	drop wrong*
}

*Replacing wrong entries assuming they are typos
* (2016 is listed above for inspection but not changed.)
replace year_inkom_dat=2013 	if year_inkom_dat==2003
replace year_inkom_dat=2008  	if year_inkom_dat==2088 | year_inkom_dat==2208
replace year_inkom_dat=2012  	if year_inkom_dat==2102
replace year_inkom_dat=2009  	if year_inkom_dat==2209

* Rebuild the date from the original month/day and the corrected year.
replace inkom_dat = mdy(month(inkom_dat), day(inkom_dat), year_inkom_dat) 	



****Anstupp_dat
* Correction of the last-day-of-employment date in two passes: a first set of
* year replacements, followed by a re-tabulation and a second set. The
* replacements operate on year_anstupp_dat and anstupp_dat is rebuilt from it
* after each pass.
* NOTE(review): the replacements are hand-curated and sequential; the
* id-specific ones depend on the varselid / varsel_postid values in the
* current data delivery.
tab year_anstupp_dat



*Listing wrong entries 	(Note: All levels of year_anstupp_dat have been checked but these are the ones deemed possible to correct
* Years before the plausible range.
foreach year of numlist 606 909 910 1012 1013 1111 1212 1313 1909 1982 1986 1989 1996 1998 1999 2000 2001 2002 2003{
	display "THE YEAR TO LOOK FOR IS `year'"
	bys varsel_postid (anstupp_dat): egen wrong_entry_`year' = max(year_anstupp_dat==`year') 	
	list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus  if wrong_entry_`year'==1, sepby(varsel_postid)
	drop wrong*
}

* Years after the plausible range.
foreach year of numlist 	2018 2019 2020 2021 2025 2028 2029 2030 2031 2040 2055 2066 2073 2077 2080 2081 2088 2090 2099 2101 2102 2103 2104 2105 2110 2112 2140 2206 2208 2209 2301 2313 2913 2914 3013 3023 5015 9009{
	display "THE YEAR TO LOOK FOR IS `year' "
	bys varsel_postid (anstupp_dat): egen wrong_entry_`year' = max(year_anstupp_dat==`year') 	
	list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus  if wrong_entry_`year'==1, sepby(varsel_postid)
	drop wrong*
}
	

*Replacing wrong entries assuming they are typos
* These replacements apply to every observation with the given year.
replace year_anstupp_dat=2006 	 	if year_anstupp_dat==2055
replace year_anstupp_dat=2006 	 	if year_anstupp_dat==606 	| year_anstupp_dat==2206 | year_anstupp_dat==2066
replace year_anstupp_dat=2007 	 	if year_anstupp_dat==2077
replace year_anstupp_dat=2008 	 	if year_anstupp_dat==2208 	| year_anstupp_dat==2088
replace year_anstupp_dat=2009	 	if year_anstupp_dat==909 	| year_anstupp_dat==1909 | year_anstupp_dat==9009 | year_anstupp_dat==2000 | year_anstupp_dat==2209 | year_anstupp_dat==2090
replace year_anstupp_dat=2010 	 	if year_anstupp_dat==910 	| year_anstupp_dat==2110 | year_anstupp_dat==2101
replace year_anstupp_dat=2012	  	if year_anstupp_dat==1012 	| year_anstupp_dat==1212 | year_anstupp_dat==2112 | year_anstupp_dat==2102 | year_anstupp_dat==2021
replace year_anstupp_dat=2013	  	if year_anstupp_dat==1013 	| year_anstupp_dat==1313 | year_anstupp_dat==1213 | year_anstupp_dat==3013 | year_anstupp_dat==2913 | year_anstupp_dat==2313 | year_anstupp_dat==2301 | year_anstupp_dat==2103
replace year_anstupp_dat=2014	  	if year_anstupp_dat==2914  	| year_anstupp_dat==2140 | year_anstupp_dat==2104
replace year_anstupp_dat=2015	  	if year_anstupp_dat==5015  	| year_anstupp_dat==2105	
	

* Year 2001: corrected only for the specific entries below.
replace year_anstupp_dat=2011 		if varselid==30945  &   varsel_postid==549    & year_anstupp_dat==2001	
replace year_anstupp_dat=2010 		if varselid==17686  &   varsel_postid==5409   & year_anstupp_dat==2001
replace year_anstupp_dat=2010 		if varselid==21784  &   varsel_postid==7205   & year_anstupp_dat==2001	
replace year_anstupp_dat=2011 		if varselid==32847  &   varsel_postid==31535  & year_anstupp_dat==2001	
replace year_anstupp_dat=2011 		if varselid==11330  &   varsel_postid==39620  & year_anstupp_dat==2001	
replace year_anstupp_dat=2011 		if varselid==7220   &   varsel_postid==80284  & year_anstupp_dat==2001	
replace year_anstupp_dat=2010 		if varselid==25271  &   varsel_postid==91481   & year_anstupp_dat==2001	
replace year_anstupp_dat=2010 		if varselid==11667  &   varsel_postid==104296   & year_anstupp_dat==2001	
replace year_anstupp_dat=2010 		if varselid==7646   &   varsel_postid==117150   & year_anstupp_dat==2001	
replace year_anstupp_dat=2011 		if varselid==6783   &   varsel_postid==119540  & year_anstupp_dat==2001	
replace year_anstupp_dat=2011 		if varselid==16465   &   varsel_postid==135874  & year_anstupp_dat==2001

* Year 2003: corrected only for the specific entries below.
replace year_anstupp_dat=2013 		if varselid==554   &   varsel_postid==6998  & year_anstupp_dat==2003	
replace year_anstupp_dat=2009 		if varselid==33568   &   varsel_postid==76011  & year_anstupp_dat==2003	
replace year_anstupp_dat=2009 		if varselid==1938   &   varsel_postid==128358  & year_anstupp_dat==2003	
replace year_anstupp_dat=2006 		if varselid==10737   &   varsel_postid==143710  & year_anstupp_dat==2003	

* Year 2031: corrected only for this entry.
replace year_anstupp_dat=2013 		if varselid==10539   &   varsel_postid==140193  & year_anstupp_dat==2031	


*Replacing anstupp_dat with the corrected values
replace anstupp_dat = mdy(month(anstupp_dat), day(anstupp_dat), year_anstupp_dat) 	

* The remaining "wrong entries" I do not dare to change. It looks like
* employers type in arbitrary values because they don't know when the
* individual is to be laid off.
tab year_anstupp_dat	


* Second pass: list the remaining suspicious years.
* NOTE(review): 5015 and 2105 are already replaced by 2015 in the first pass,
* so their listings and replacements below cannot match anything.
foreach year of numlist 1016 1999 2031 2048 2080 2097 2096 2099 2105 2106 2108 2109 2190 2916 2919 5015{
	display "THE YEAR TO LOOK FOR IS `year'"
	bys varsel_postid (anstupp_dat): egen wrong_entry_`year' = max(year_anstupp_dat==`year') 	
	list varselid varsel_postid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat varselstatus  if wrong_entry_`year'==1, sepby(varsel_postid)
	drop wrong*
}

*Replacing wrong entries assuming they are typos
replace year_anstupp_dat=2015 	 	if year_anstupp_dat==5015
replace year_anstupp_dat=2019 	 	if year_anstupp_dat==2919
replace year_anstupp_dat=2016 	 	if year_anstupp_dat==2916
replace year_anstupp_dat=2019 	 	if year_anstupp_dat==2190
replace year_anstupp_dat=2019 	 	if year_anstupp_dat==2109
replace year_anstupp_dat=2018 	 	if year_anstupp_dat==2108
replace year_anstupp_dat=2016 	 	if year_anstupp_dat==2106
replace year_anstupp_dat=2015 	 	if year_anstupp_dat==2105
replace year_anstupp_dat=2015 	 	if year_anstupp_dat==2105
replace year_anstupp_dat=2018 	 	if year_anstupp_dat==2048
replace year_anstupp_dat=2016 	 	if year_anstupp_dat==1016

* Corrections restricted to a single notification.
replace year_anstupp_dat=2017 	 	if year_anstupp_dat==2099 & varselid==41786 

replace year_anstupp_dat=2020 	 	if year_anstupp_dat==2096 & varselid==45790

*Replacing anstupp_dat with the corrected values
* The month and day come from the date as rebuilt after the first pass.
replace anstupp_dat = mdy(month(anstupp_dat), day(anstupp_dat), year_anstupp_dat) 	


	

}
*

* Drop rows whose (corrected) last-day-of-employment year is after 2025.
* Stata treats missing as larger than any number, so rows with a missing
* year_anstupp_dat are removed here as well; the condition spells that out.
* NOTE(review): confirm that dropping the missing-date rows is intended.
drop if year_anstupp_dat > 2025 | missing(year_anstupp_dat)
drop year*
{ // Variable for number of displaced workers
********************************************************************************
* The number of rows per varselid is the number of displaced workers, because
* only one observation per individual and varsel is left at this point. It can
* be compared with the firm-level counts.
bysort varselid : generate antvars = _N
label variable antvars "Number of displaced workers within varselid (calculated from indivual data)" 
}
*
{ // Checking inconsistency in type of notification (Drops 1,447 + 17 obs)
*===============================================================================

* Dropping temporary lay-offs (permittering)
tab varseltyp
// PEDE = Permittering, deltid
// Pehe = Permittering, heltid
// Upsa = Uppsagning
// .34% of observations are about permittering. DS think we should drop them. 
// Permittering = temporary "layoff"

* Keep only notifications in which every row has varseltyp code 3; a
* notification with any other type on one of its rows is dropped entirely.
egen all_type3 = min(varseltyp == 3), by(varselid)
keep if all_type3 == 1  // Drops 1389 observations. 0.33 percent.
drop all_type3


* Drop rows where the number of notified workers exceeds the number of
* employees (temp==0 marks novars_tot > noemp_tot).
generate temp = (novars_tot <= noemp_tot)
tab temp // 17 observations are wrong.

* The flag must be constant within varselid, i.e. the mistake is at the
* varselid level; the script stops here if it is not.
egen mintemp = min(temp), by(varselid)
egen maxtemp = max(temp), by(varselid)
assert mintemp == maxtemp

keep if temp == 1 // drops 17 obs
drop temp mintemp maxtemp

}
*
{ // Checking duplicates
********************************************************************************
* Three de-duplication steps, each keeping one row per group:
*   1. rows identical on firm, person and all timestamps -> largest varselid
*   2. same firm, person and notification month          -> latest tr_dat
*   3. same firm, person and notification year           -> earliest tr_dat
*
* Fix relative to the previous version: the row to keep used to be chosen
* within (firmid persid) instead of within the duplicate key itself. A person
* with duplicate sets in more than one month/year at the same firm therefore
* kept one row in total and lost whole notifications. Each step now groups on
* its own duplicate key. varselid is added as a final sort key so that ties in
* tr_dat are resolved the same way in every run.
* NOTE(review): the observation counts quoted in the comments below were
* obtained with the previous logic and should be re-checked against the log.
duplicates r varselid persid


* There seems to be a problem with the same individual being displaced from 
* different plants(!) at the same time within the same firm.
* As we are only using variation at the firm level we want only one notification
* per firm per individual at a specific point in time. Basically this implies
* that we have duplicate observations, as all variables are equal except plantid.
* I drop these and keep the varselid with the largest number.
local exact_key firmid persid tr_dat ankomst_dat list_dat firstgo_dat avslut_dat inkom_dat anstupp_dat
duplicates rep `exact_key'

* Step 1: within each set of exact duplicates keep the largest varselid
* (previously reported as 242 obs deleted).
bys `exact_key' (varselid) : keep if _n==_N


* I have now dropped observations with the exact same Timestamp. However, there
* are still observations that have the same persid, firmid, and month in Timestamp.
* In order to calculate tenure at the time of the notification of displacement 
* and as people can be notified of displacement several times at the same firm,
* I create unique observations by year and month of a varsel and persid in order
* to be able to merge this with the Jobb_data where I can calculate tenure at
* that particular month.
 
count
*Generate: date variables used to merge with jobb data and calculate tenure
gen ym_inkom_dat 		=  ym(year(inkom_dat)   , 	month(inkom_dat))

format  ym_inkom_dat %tm


* THIS IS A RESTRICTION THAT A PARTICULAR PERSON CAN NOT BE NOTIFIED OF 
* DISPLACEMENT MORE THAN ONCE AT THE SAME FIRM AT THE SAME MONTH.
duplicates report 		firmid persid ym_inkom_dat

* Step 2: keep the observation with the latest entry (tr_dat) within each
* firm-person-month.
bys firmid persid ym_inkom_dat (tr_dat varselid) : keep if _n==_N

duplicates report firmid persid ym_inkom_dat

*Generate merging variable with Jobb data
*clonevar date = ym_inkom_dat
*note 	 date : "date==ym_inkom_dat"
*drop ym_inkom_dat ym_list_dat


** Duplicates within year **
*  Note: So now I have unique notification observations at year/month/persid.
* 	 Nevertheless, I check how many notifications individuals have during a 
*	 year. One individual (persid 9183) is notified more than a hundred
*	 times (earlier notes say 117 and 124) and is dropped. After that the
*	 maximum number of notifications was 10 in total, or 4 within a year.

gen x_year = year(inkom_dat)
duplicates report x_year persid 
duplicates report persid

*Dropping the individual with more than a hundred notifications
drop if persid==9183

duplicates report persid		// an individual is notified at max 10 times during the sample period
duplicates report x_year persid 	// an individual is at max notified 4 times during a year
duplicates report x_year persid firmid  // an individual is at max notified 3 times from the same firm during the same year


* Step 3: keep the observation with the earliest(!) entry (tr_dat) within each
* firm-person-year.
bys firmid persid x_year (tr_dat varselid) : keep if _n==1


duplicates report x_year persid firmid

drop x_year
}
*
{ // Variable for number of displaced workers (after duplicate removal)
********************************************************************************
* 	Number of displaced workers calculated here can be compared with firm
* 	level data. We can do this only using 'bys varselid :' since we only have
*	one observation per individual per varsel. 
*	antvars2 is the same count as antvars, recomputed on the rows that are
*	left at this point. The label was previously applied to antvars again,
*	which left antvars2 unlabelled; it now goes on antvars2 with a text
*	that tells the two variables apart.

bys varselid : egen antvars2 = total(1)
label var antvars2 "Number of displaced workers within varselid (individual data, after dedup)" 


}
*
{ // Define notification dates & notification times
*===============================================================================

* Default notification date: the month (year/month) in which inkom_dat falls.
generate notdate_def = mofd(inkom_dat)
format notdate_* %tm

* Default notification time: days from inkom_dat to anstupp_dat.
generate nottime_def = anstupp_dat - inkom_dat

* Summary
summarize nottime_*, detail


}
*

* Dropping variables to increase speed in future .do-files
drop fullpersfort regelavsteg ///
	lan kommun ansvkontor ///
	novarsel_wcwo novarsel_coll novarsel_rest ///
	noemp_men noemp_wom noemp_coll noemp_rest noemp_wcwo

* How often is the same individual displaced from the same firm (max 8 times
* according to an earlier run) ...
duplicates report persid firmid
* ... and how often overall (max 10 times according to an earlier run)?
duplicates report persid

* Generate: merging support variable used in A2_merge_RAMS_varsel.do.
* It numbers the rows within each person-firm pair.
* NOTE(review): no sort order is given inside the pair, so which row gets
* which number is not guaranteed to be the same across runs.
bysort persid firmid : generate temp = _n

* Save data
compress
count
save "$datapath/A1_varseldata.dta", replace


log close




		
